import pickle
import numpy as np
import pandas as pd
import os, glob
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold, cross_validate
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
## The following are the ML models which can be used for trasinning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import timeit
import warnings
warnings.filterwarnings("ignore")
E:\Anaconda3\envs\tf\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. from pandas import MultiIndex, Int64Index
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline
import pandas as pd
import seaborn as sns
sns.set(style="darkgrid")
sns.set_context('talk')
from IPython.display import display, HTML, Image
import matplotlib.colors as mcolors
dataFolder = os.getcwd()
group1_low =1.634
group2_low =0.673
group2_upper =1.634
group3_upper =0.673
## read the test file
data =pd.read_csv(os.path.join(dataFolder,'10_PC_02_LHS_5000_54854_01_s1_G.csv'))
data.columns =[col.strip() for col in data.columns]
data['ratio'] = data['b(CaO)']/data['b(SiO2)']
group1 = data[data['ratio']>=group1_low] # with Portlandite, no Amor-S1
group2 = data[(data['ratio']<=group2_upper) & (data['ratio']>=group2_low)] # no Portlandite, no Amor-S1
group3 = data[data['ratio']<=group3_upper] # no Portlandite, with Amor-S1
groups = [group1,group2,group3]
## read trained Model info:
modelInfo =pd.read_csv(os.path.join(dataFolder,'ModelTrainSummary_all.csv'))
modelInfo = modelInfo.iloc[:,1:4]
modelInfo
| group | var | modelType | |
|---|---|---|---|
| 0 | group1 | Vol(aq) | rfModel |
| 1 | group1 | Vol(aq) | lgbModel |
| 2 | group1 | Vol(aq) | gbModel |
| 3 | group1 | Vol(aq) | GPyModel |
| 4 | group1 | pH | CONST |
| ... | ... | ... | ... |
| 73 | group3 | nCa(CSHQ) | linear |
| 74 | group3 | nSi(CSHQ) | linear |
| 75 | group3 | nH2O(CSHQ) | linear |
| 76 | group3 | C/S(CSHQ) | CONST |
| 77 | group3 | nGelPW(CSH) | linear |
78 rows × 3 columns
modelTypeLst = modelInfo['modelType'].unique()
MLModels =[ml for ml in modelTypeLst if 'Model' in ml]
nonMLModels = [ml for ml in modelTypeLst if 'Model' not in ml]
## load the mdoels
dataCols = data.columns.to_list()
modelsLst = []
for irow,row in modelInfo.iterrows():
group = row['group']
var =row['var']
modelType = row['modelType']
icol =dataCols.index(var)
if '/' in var:
fileCol = var.replace("/",'')
else:
fileCol = var
modelFolder = os.path.join(dataFolder,'SavedModel',group)
if modelType=='CONST':
ext = '.csv'
fileName = os.path.join(modelFolder,modelType + '_'+str(icol)+'_'+fileCol+ext)
tempModel =pd.read_csv(fileName)
else:
ext ='.sav'
fileName = os.path.join(modelFolder,modelType + '_'+str(icol)+'_'+fileCol+ext)
tempModel = pickle.load(open(fileName, 'rb'))
modelsLst.append(tempModel)
modelInfo['modelObj'] = modelsLst
finalDF = pd.DataFrame()
for ML in MLModels:
modelProcessing =[]
modelProcessing = nonMLModels.copy()
modelProcessing.append(ML)
tempModelInfo = modelInfo[modelInfo['modelType'].isin(modelProcessing)]
for col in group1.columns[4:-1]: # exclude 'ratio'
colModelInfo = tempModelInfo[tempModelInfo['var'] == col]
for i, group in enumerate(groups):
dataX = group.iloc[:,1:4]
dataY = group[col].values
groupModelInfo = colModelInfo[colModelInfo['group'] =='group'+str(i+1)]
modelType = groupModelInfo['modelType'].values[0]
model = groupModelInfo['modelObj'].values[0]
if modelType=='CONST':
y_pred = list(model['const'].values)*len(dataY)
elif modelType=='linear':
y_pred = model.predict(dataX.values)
else:
scaler = StandardScaler().fit(dataX.values)
X_scaled = scaler.transform(dataX.values) # This will be used for input of trainning dataset
y_pred = model.predict(X_scaled)
tempDF = pd.DataFrame({'testDataY':dataY, 'predDataY':y_pred})
tempDF['modelType'] = [modelType]*len(tempDF)
tempDF['var'] = [col]*len(tempDF)
tempDF['group'] = ['group'+str(i+1)]*len(tempDF)
tempDF['modelSimulation'] = [ML+'_simulation']*len(tempDF)
if len(finalDF)==0:
finalDF = tempDF
else:
finalDF = pd.concat([finalDF,tempDF],axis=0)
finalDF.to_csv('Comparison_predict_5KDataset.csv',index=False)
def drawPlots(trgVar,testDataY,predDataY,simulation):
fig, ax =plt.subplots(figsize=(8,8))
RMSE = mean_squared_error(testDataY,predDataY)
R2 = r2_score(testDataY,predDataY)
axMax = max(max(testDataY),max(predDataY))
axMin = min(min(testDataY),min(predDataY))
## Fitting the line
x2= np.linspace(axMin,axMax,30);
y2 = x2
ax.plot(testDataY,predDataY,'bo',markerfacecolor = 'blue',markeredgecolor ='darkblue',markeredgewidth=0.5,label=trgVar)
#ax.plot(df3['b(SiO2)'].values,df3['b(CaO)'].values,'bo',markerfacecolor = 'c',markeredgecolor ='b',markeredgewidth=0.5,label='no Amor-S1')
#ax.plot(group3['b(SiO2)'].values,group3['b(CaO)'].values,'g^',markerfacecolor = 'm',markeredgecolor ='m',markeredgewidth=1,label='group3')
ax.plot(x2,y2,'r-',lw=3,label ='1:1 ratio')
ax.legend (loc='best',ncol=5)
ax.set_xlim(axMin,axMax)
ax.set_ylim(axMin,axMax)
#ax.text(0.05, 0.8, 'Above the line with Portlandite',fontsize = 20,color = 'k')
#ax.text(0.4, 0.50, 'Below the line no Portlandite',fontsize = 20,color = 'k')
ax.set_title(simulation+ ' ====> '+trgVar)
ax.set_xlabel('testDataY')
ax.set_ylabel('PredDataY')
return R2, RMSE
modelSimulations = finalDF['modelSimulation'].unique()
trgVars = finalDF['var'].unique()
statMeasure = []
for simulation in modelSimulations:
if 'MLP' in simulation or 'xgb' in simulation: # skip the xgb and MLP models
continue
tempDF = finalDF[finalDF['modelSimulation']==simulation]
for trgVar in trgVars:
df = tempDF[tempDF['var']==trgVar] # no Portlandite
testDataY = df['testDataY'].values
predDataY = df['predDataY'].values
R2,RMSE = drawPlots(trgVar,testDataY,predDataY,simulation)
statMeasure.append([trgVar,R2,RMSE,simulation])
statMeasDF = pd.DataFrame(statMeasure,columns=['Var','R2','RMSE','modelSimulation'])
statMeasDF
| Var | R2 | RMSE | modelSimulation | |
|---|---|---|---|---|
| 0 | Vol(aq) | 0.991504 | 8.080633e-06 | rfModel_simulation |
| 1 | pH | 0.990699 | 7.314908e-03 | rfModel_simulation |
| 2 | nCa(aq) | 0.969484 | 1.660522e-08 | rfModel_simulation |
| 3 | nCa(s) | 1.000000 | 1.601451e-08 | rfModel_simulation |
| 4 | nSi(aq) | 0.978627 | 3.207269e-10 | rfModel_simulation |
| 5 | nSi(s_reac) | 1.000000 | 2.703401e-10 | rfModel_simulation |
| 6 | nPortlandite | 0.995254 | 7.027868e-04 | rfModel_simulation |
| 7 | nAmor-Sl | 0.961508 | 2.075204e-04 | rfModel_simulation |
| 8 | mCSHQ | 0.998835 | 1.042550e-06 | rfModel_simulation |
| 9 | nCa(CSHQ) | 1.000000 | 1.716375e-08 | rfModel_simulation |
| 10 | nSi(CSHQ) | 1.000000 | 2.703401e-10 | rfModel_simulation |
| 11 | nH2O(CSHQ) | 0.998887 | 2.102353e-04 | rfModel_simulation |
| 12 | C/S(CSHQ) | 0.987923 | 1.549080e-03 | rfModel_simulation |
| 13 | nGelPW(CSH) | 0.999050 | 3.530165e-05 | rfModel_simulation |
| 14 | Vol(aq) | 0.991977 | 7.631142e-06 | lgbModel_simulation |
| 15 | pH | 0.978397 | 1.698957e-02 | lgbModel_simulation |
| 16 | nCa(aq) | 0.962786 | 2.025020e-08 | lgbModel_simulation |
| 17 | nCa(s) | 1.000000 | 1.601451e-08 | lgbModel_simulation |
| 18 | nSi(aq) | 0.982556 | 2.617732e-10 | lgbModel_simulation |
| 19 | nSi(s_reac) | 1.000000 | 2.703401e-10 | lgbModel_simulation |
| 20 | nPortlandite | 0.994461 | 8.201820e-04 | lgbModel_simulation |
| 21 | nAmor-Sl | 0.883784 | 6.265463e-04 | lgbModel_simulation |
| 22 | mCSHQ | 0.995146 | 4.342887e-06 | lgbModel_simulation |
| 23 | nCa(CSHQ) | 1.000000 | 1.716375e-08 | lgbModel_simulation |
| 24 | nSi(CSHQ) | 1.000000 | 2.703401e-10 | lgbModel_simulation |
| 25 | nH2O(CSHQ) | 0.994282 | 1.080325e-03 | lgbModel_simulation |
| 26 | C/S(CSHQ) | 0.970519 | 3.781303e-03 | lgbModel_simulation |
| 27 | nGelPW(CSH) | 0.999050 | 3.530165e-05 | lgbModel_simulation |
| 28 | Vol(aq) | 0.990172 | 9.348100e-06 | gbModel_simulation |
| 29 | pH | 0.977410 | 1.776561e-02 | gbModel_simulation |
| 30 | nCa(aq) | 0.955208 | 2.437375e-08 | gbModel_simulation |
| 31 | nCa(s) | 1.000000 | 1.601451e-08 | gbModel_simulation |
| 32 | nSi(aq) | 0.984079 | 2.389144e-10 | gbModel_simulation |
| 33 | nSi(s_reac) | 1.000000 | 2.703401e-10 | gbModel_simulation |
| 34 | nPortlandite | 0.989438 | 1.563931e-03 | gbModel_simulation |
| 35 | nAmor-Sl | 0.816584 | 9.888377e-04 | gbModel_simulation |
| 36 | mCSHQ | 0.992656 | 6.570474e-06 | gbModel_simulation |
| 37 | nCa(CSHQ) | 1.000000 | 1.716375e-08 | gbModel_simulation |
| 38 | nSi(CSHQ) | 1.000000 | 2.703401e-10 | gbModel_simulation |
| 39 | nH2O(CSHQ) | 0.992220 | 1.469860e-03 | gbModel_simulation |
| 40 | C/S(CSHQ) | 0.965003 | 4.488907e-03 | gbModel_simulation |
| 41 | nGelPW(CSH) | 0.999050 | 3.530165e-05 | gbModel_simulation |
| 42 | Vol(aq) | 0.999616 | 3.655889e-07 | GPyModel_simulation |
| 43 | pH | 0.996629 | 2.650814e-03 | GPyModel_simulation |
| 44 | nCa(aq) | 0.998023 | 1.075643e-09 | GPyModel_simulation |
| 45 | nCa(s) | 1.000000 | 1.601451e-08 | GPyModel_simulation |
| 46 | nSi(aq) | 0.978039 | 3.295589e-10 | GPyModel_simulation |
| 47 | nSi(s_reac) | 1.000000 | 2.703401e-10 | GPyModel_simulation |
| 48 | nPortlandite | 0.999563 | 6.472451e-05 | GPyModel_simulation |
| 49 | nAmor-Sl | 0.983547 | 8.870394e-05 | GPyModel_simulation |
| 50 | mCSHQ | 0.999850 | 1.341174e-07 | GPyModel_simulation |
| 51 | nCa(CSHQ) | 1.000000 | 1.716375e-08 | GPyModel_simulation |
| 52 | nSi(CSHQ) | 1.000000 | 2.703401e-10 | GPyModel_simulation |
| 53 | nH2O(CSHQ) | 0.999803 | 3.731310e-05 | GPyModel_simulation |
| 54 | C/S(CSHQ) | 0.999396 | 7.748488e-05 | GPyModel_simulation |
| 55 | nGelPW(CSH) | 0.999050 | 3.530165e-05 | GPyModel_simulation |
statMeasDF_pivot=statMeasDF.pivot(index='Var',values=['R2','RMSE'],columns='modelSimulation')
statMeasDF.to_csv('statMeasure_trainDataset_noPivot.csv',index = False)
statMeasDF_pivot.to_csv('statMeasure_trainDataset_Pivot.csv')
## alternative way to draw figures
def drawCombiningPlots(plotDataDF,trgVar):
fig, ax =plt.subplots(figsize=(10,10))
marker = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']
colors = ['tab:blue','tab:orange','tab:gray','tab:olive','tab:cyan','tab:green']
## Fitting the line
axisMin=1000
axisMax =-1000
for i, col in enumerate(plotDataDF.columns[1:]):
testDataY = plotDataDF.iloc[:,0].values
predDataY = plotDataDF[col].values
axMax = max(max(testDataY),max(predDataY))
axMin = min(min(testDataY),min(predDataY))
axisMin = min(axisMin,axMin)
axisMax = max(axisMax,axMax)
ax.scatter(testDataY,predDataY,marker =marker[i],edgecolor =colors[i],label=col.replace('_simulation',''))
x2= np.linspace(axisMin,axisMax,30);
y2 = x2
ax.plot(x2,y2,'r-',lw=3,label ='1:1 ratio')
ax.legend (loc='best',ncol=3)
ax.set_xlim(axisMin,axisMax)
ax.set_ylim(axisMin,axisMax)
ax.set_title(trgVar)
ax.set_xlabel('testDataY')
ax.set_ylabel('PredDataY')
modelSimulations = finalDF['modelSimulation'].unique()
trgVars = finalDF['var'].unique()
for trgVar in trgVars:
tempdf = finalDF[finalDF['var']==trgVar] # no Portlandite
dataplot = {}
plotDataDF = pd.DataFrame()
for simulation in modelSimulations:
# skip the xgb and MLP models
# the two models are very bad, need to be fine tune
if 'MLP' in simulation or 'xgb' in simulation:
continue
df = tempdf[tempdf['modelSimulation']==simulation]
if len(plotDataDF)==0:
plotDataDF=df[['testDataY','predDataY']]
else:
plotDataDF = pd.concat([plotDataDF,df[['predDataY']]],axis=1)
plotDataDF.rename({'predDataY': simulation}, axis=1,inplace=True)
drawCombiningPlots(plotDataDF,trgVar)